#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-01 22:08:22
# Project: ganji_job

from pyspider.libs.base_handler import *
import re
import random
import MySQLdb
import time


class Handler(BaseHandler):
    """pyspider handler that collects phone/contact pairs from bj.ganji.com.

    Flow: ``on_start`` queues the listing pages, ``index_page`` queues every
    link found in the listing, ``detail_page`` extracts the
    ``data-pub-resume-url`` target from the posting HTML,
    ``get_tel_from_page`` reads the phone number and contact text from that
    page, and ``on_result`` stores the pair in the ``ganji_job`` MySQL table.
    """

    # Pool of User-Agent headers; one is picked at random for each listing
    # and posting request (see ``_random_header``).
    default_header = [
        {'User-Agent': 'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE6.0;WindowsNT5.1)'},
        {'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1'},
        {'User-Agent': 'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1'},
        {'User-Agent': 'Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11'},
        {'User-Agent': 'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11'},
        {'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TencentTraveler4.0)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TheWorld)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Trident/4.0;SE2.XMetaSr1.0;SE2.XMetaSr1.0;.NETCLR2.0.50727;SE2.XMetaSr1.0)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;AvantBrowser)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)'}
    ]

    # Cookies sent with the phone-page request issued by ``detail_page``.
    # NOTE(review): every key except the first carries a leading space (they
    # look like they were split from a raw Cookie header on ';'). Left
    # untouched here -- confirm the fetcher/server tolerate it before
    # stripping. These are also hard-coded session values that will expire.
    login_cookies = {'ganji_uuid': '8521712200318470401733',
                     ' ganji_xuuid': '5ac67d89-bbc1-4fb9-c53e-0d2a5e94f32d.1541061507145',
                     ' xxzl_deviceid': 'Ekt1HkxlXnahQSXStFfRk1HxVKHoquZlAL8xs8UG9qbzmlT9gmPYPYfRyGQP%2Bk2L', ' lg': '1',
                     ' citydomain': 'bj',
                     ' __utmz': '32156897.1541061516.1.1.utmcsr=bj.ganji.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
                     ' WantedListPageScreenType': '1920',
                     ' gj_footprint': '%5B%5B%22%5Cu552e%5Cu524d%5Cu5de5%5Cu7a0b%5Cu5e08%22%2C%22%5C%2Fzpsqgongchengshi%5C%2F%22%5D%2C%5B%22%5Cu6280%5Cu5de5%5C%2F%5Cu5de5%5Cu4eba%22%2C%22%5C%2Fzpjigongyibangongren%5C%2F%22%5D%2C%5B%22%5Cu9500%5Cu552e%22%2C%22%5C%2Fzpshichangyingxiao%5C%2F%22%5D%5D',
                     ' sscode': 'g9IFZ%2BZSH3u9JEjgg93avdJO', ' GanjiUserName': '%23t_802660659',
                     ' GanjiUserInfo': '%7B%22user_id%22%3A802660659%2C%22email%22%3A%22%22%2C%22username%22%3A%22%23t_802660659%22%2C%22user_name%22%3A%22%23t_802660659%22%2C%22nickname%22%3A%22%22%7D',
                     ' bizs': '%5B%5D', ' xxzl_smartid': '889ef013f0f1d2c5fe36c611ffdfd1f8',
                     ' last_name': '%23t_802660659', ' GanjiLoginType': '1',
                     ' _wap__utmganji_wap_newCaInfo_V2': '%7B%22ca_n%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_i%22%3A%22-%22%7D',
                     ' __utma': '32156897.696892250.1541061516.1541133153.1541135035.7', ' __utmt': '1',
                     ' _gl_tracker': '%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A30422724802%7D',
                     ' GANJISESSID': '4hh2km45ldiqukp9hqsua8uqep', ' ganji_login_act': '1541135310722',
                     ' __utmb': '32156897.4.10.1541135035',
                     ' supercookie': 'BQNlAwLjAwH5WQIyLzH2ZQSwZTExAzAwLmLkAGOyBGqxAQEuLJIxL2L2ZQIyAJDkL2D%3D',
                     ' __utmc1': '32156897'}

    # NOTE(review): empty and not referenced anywhere in this class; the
    # name may have been meant as ``crawl_config`` -- confirm before renaming.
    craw_config = {
    }

    # Picked once, when the class body is executed. Kept only so existing
    # references to ``list_header`` keep working; requests made by this class
    # now draw a fresh header each time via ``_random_header``.
    list_header = random.choice(default_header)

    # Exclusive upper bound of the listing page numbers queued by
    # ``on_start`` (3 means pages 1 and 2).
    # NOTE(review): the name suggests an inclusive maximum -- confirm intent.
    page_max = 3

    def _random_header(self):
        """Return one User-Agent header dict picked at random from the pool."""
        return random.choice(self.default_header)

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue listing pages ``o1`` up to (not including) ``o<page_max>``."""
        for page in range(1, self.page_max):
            self.crawl('http://bj.ganji.com/zpbiaoqian/o' + str(page), callback=self.index_page,
                       headers=self._random_header())

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue a ``detail_page`` crawl for every link in the listing."""
        for each in response.doc('.con-list-zcon  dt.con-list-zp').children('a').items():
            href = each.attr.href
            if not href:
                # An anchor without an href gives nothing to crawl.
                continue
            self.crawl(href, callback=self.detail_page, headers=self._random_header())

    @config(priority=2)
    def detail_page(self, response):
        """Find the phone-page URL in the posting HTML and queue it."""
        html = response.text
        # Get the URL of the phone detail page. The greedy ``.*`` prefix
        # together with re.S makes this select the LAST
        # ``data-pub-resume-url`` attribute in the document.
        res = re.match(r'.*data-pub-resume-url\s*=\s*\'([^\']+)\'', html, re.M | re.I | re.S)
        if res is None:
            print("none")
            return
        self.crawl(res.group(1), callback=self.get_tel_from_page, validate_cert=False,
                   cookies=self.login_cookies)

    # Scrape the page that carries the phone number.
    def get_tel_from_page(self, response):
        """Return ``{'tel': ..., 'contact': ...}`` extracted from the page.

        ``contact`` is the text of ``.font-grey`` with the text of its
        ``<em>`` children removed.
        """
        tel = response.doc('.apply-pos-v2-tit').children('b').text()
        contact_all = response.doc('.font-grey')
        contact_extra = contact_all.children('em')
        contact = contact_all.text().replace(contact_extra.text(), '')
        return {'tel': tel, 'contact': contact}

    def on_result(self, result):
        """Store a scraped result in MySQL, then hand it to the base handler.

        Empty results are ignored. Database errors during the write are
        printed and rolled back (best effort), and the result is still
        passed on to ``BaseHandler.on_result``. A failure to connect is not
        caught and propagates to the caller.
        """
        if not result:
            return
        print(result)
        now = int(time.time())
        # Scraped text is untrusted: let the driver quote the values instead
        # of interpolating them into the statement.
        sql = ("replace into ganji_job (`tel`,`contact`,`created_at`,`updated_at`) "
               "values (%s,%s,%s,%s)")
        params = (result['tel'], result['contact'], now, now)
        # NOTE(review): connection settings are hard-coded; consider moving
        # them to configuration.
        db = MySQLdb.connect("localhost", "root", "123456", "yii2", charset='utf8')
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            db.commit()
        except MySQLdb.Error as err:
            db.rollback()
            print("errr: %s" % (err,))
        finally:
            # Always release the connection, even if rollback itself fails.
            db.close()
        super(Handler, self).on_result(result)




































